
********************************************************************
***** DATA CREATION OF SOCIOLOGY:  GENDER AND COLLABORATION (LORENZO DUCTOR, SANJEEV GOYAL AND ANJA PRUMMER)
***** Start: 13.02.2020
***** Last change:  10.09.2021
***** CREATED BY: Lorenzo Ductor (lductor@ugr.es)   
***** OBJECTIVE: do file to create the main file for section A of the appendix: sociologydatabase.dta
***** Input: 
***** i) sociology.dta; it includes all the articles published in Sociological Abstracts
***** ii) authors.dta, it includes the authorid and authorname
***** Output:
***** i) sociologydatabase.dta; it includes all the variables for the analysis presented in Section A of the Online Appendix.
********************************************************************


/* 1) CREATING STRENGTH VARIABLES*/
* For each year i in 1963-1999 this section takes the articles published in
* the five-year window [i-4, i], builds every ordered pair of coauthors and
* saves author-level tie-strength measures to strength`i'.dta.
*
* The imported article table is parked in a temporary file instead of being
* saved as sociology.dta: section 3 of this do-file opens sociology.dta and
* expects the raw article records there (string year, author1-author7), so
* this section must not overwrite that file.
import delimited sociology, clear
gen year2=.
tempfile articles
save `articles'

forvalues i=1963/1999{
    use `articles', clear

    * Keep the five-year window ending in year i.
    drop if year<`i'-4
    keep if year<`i'+1

    * Drop articles whose last author slot is empty although nauthors says
    * it should be filled.
    forvalues k=2/7{
        drop if auth`k'==. & nauthors==`k'
    }

    keep articleid year auth1 auth2 auth3 auth4 auth5 auth6 auth7 nauthors year2

    * One row per (article, author slot); empty slots are removed.
    bys articleid: gen t=_n
    reshape long auth, i(articleid t) j(n)
    drop if auth==.
    gen x=1
    drop if nauthors<0

    * snpapers5: number of author-article rows of each author in the window
    * (counted before single-authored papers are removed).
    bys auth: egen snpapers5=total(x)
    drop if nauthors==1

    * Build coauthor pairs. After -expand- every (article, author) row
    * appears nauthors times. Within an article sorted by auth, row
    * nauthors*k is the last copy of the k-th author, so copy k of every
    * author is paired with the k-th author of the article. This relies on
    * each article having exactly nauthors author rows at this point.
    expand nauthors
    sort articleid auth
    by articleid auth: gen numid2 = _n
    by articleid: gen auth2 = auth[nauthors * numid2]
    drop if auth==auth2

    * intensity: number of rows (joint papers in the window) of the ordered
    * pair (auth, auth2); intensityd divides it by the number of coauthors
    * (nauthors-1) of the paper on that row.
    egen newid = group(auth auth2)
    bys newid: gen intensity = _N
    bys newid: gen intensityd =intensity/(nauthors-1)
    keep auth auth2 newid intensity snpapers5 year2 intensityd
    duplicates drop

    * Author-level means and sums over the remaining pair rows.
    bys auth: egen strength=mean(intensity)
    bys auth: egen strengths=total(intensity)
    bys auth: egen strengthd=mean(intensityd)
    bys auth: egen strengthsd=total(intensityd)
    bys auth: gen strengthppaper=strength/snpapers5
    bys auth: gen strengthppaperd=strengthd/snpapers5
    keep auth strength strengthppaper strengths strengthd strengthppaperd strengthsd year2
    duplicates drop

    * Stamp the window's end year and store one file per year.
    replace year2=`i'
    save strength`i',  replace 
}

* Stack the yearly strength files (1963-1999) into one author-year file.
use strength1963, clear
foreach yr of numlist 1964/1999 {
    append using strength`yr'
}
* year2 holds the end year of each five-year window.
rename year2 year
save strength, replace


/* 2) Merging all the network files obtained from R*/
/*Run this part of the code after running "netprodbygender_soc.R" and "netprod_7authorsbygender_soc.R" in R. */
set more off

* Convert each yearly network CSV (1967-1999) into a .dta file with a year
* column and the variable names used in the rest of this do-file.
forvalues yr = 1967/1999 {
    insheet using "network`yr'_5y.csv", clear
    generate year = `yr'
    drop v1
    rename (vgauth vgdeg vgdeg2 vgnetprod vgnetprod2 vgtran) ///
           (auth   degree degree2 netprod   netprod2   clustering)
    order auth year
    sort auth year
    save network`yr'_5y, replace
}

* Stack all years and discard authors without any link.
use network1967_5y, clear
forvalues yr = 1968/1999 {
    append using network`yr'_5y
}
keep if degree != 0
save network_5y, replace

* Same conversion for the "networkgc" CSV files (1967-1999), which carry the
* degree, betweenness, closeness, eigenvector and clustering columns.
forvalues yr = 1967/1999 {
    insheet using "networkgc`yr'_5y.csv", clear
    generate year = `yr'
    drop v1
    rename (vgcauth vgcdeg   vgcbet      vgccl     vgcev       vgctr) ///
           (auth    degreegc betweenness closeness eigenvector clusteringgc)
    order auth year
    sort auth year
    save networkgc`yr'_5y, replace
}

* Stack all years and discard rows with zero degree.
use networkgc1967_5y, clear
forvalues yr = 1968/1999 {
    append using networkgc`yr'_5y
}
keep if degreegc != 0
save networkgc_5y, replace

* Combine both sets of network measures by author-year, keeping unmatched
* rows from either side, and overwrite network_5y with the result.
use networkgc_5y, clear
joinby auth year using network_5y, unmatched(both)
drop _merge
save network_5y, replace


/* 3) Creating panel data at the author-year level from articles*/
* Turns the raw article file into an article-level table with numeric author
* ids (auth1-auth7), the number of authors and a numeric publication year,
* and exports it as sociology.csv.
use sociology, clear

* year is a string in the raw file; its first four characters are converted
* to a number because year is later merged with the numeric year of the
* network/strength files and used as the time variable in -xtset-, neither
* of which accepts a string.
gen year1 = real(substr(year,1,4))
drop year journal AP keywords language field title
rename year1 year

* Look up the id of each of the (up to) seven author names in authors.dta.
* Articles whose name is not found are kept (unmatched(master)) with a
* missing id.
forvalues k = 1/7 {
    rename author`k' author
    joinby author using authors, unmatched(master)
    rename authid auth`k'
    drop author _merge
}

gen npapers = 1

* Number of author slots with a non-missing id.
egen nauthors=rownonmiss(auth1 auth2 auth3 auth4 auth5 auth6 auth7)

drop if nauthors==0
drop if missing(year)

export delimited using "sociology.csv", replace

* Attach the sj1-sj149 variables (from sj.dta) and the journal impact
* measures (from impactfactor.dta) to every article.
joinby articleid using sj, unmatched(master)
drop _merge
joinby articleid using impactfactor, unmatched(master)
drop _merge

* Article-level output weighted by each impact measure, and the same divided
* by the number of authors.
gen prodSJR= npapers*SJR /*this is not correct, each article has a different impact factor*/
gen prodH =npapers*Hindex
gen prodCites = npapers*CitesDoc2years

foreach m in SJR H Cites {
    gen prodd`m'=prod`m'/nauthors
}

* One row per (article, author).
gen t=_n
reshape long auth, i(t) 
sort auth year
drop t _j
drop if auth==.

* Author-year totals (s*) and maxima (m*) of the output measures.
foreach v in proddSJR proddH proddCites prodSJR prodH prodCites {
    bys auth year: egen s`v'=total(`v')
}
foreach v in prodSJR prodH prodCites {
    bys auth year: egen m`v'=max(`v')
}

* Mean team size and number of papers per author-year.
bys auth year: egen mnauthors=mean(nauthors)
bys auth year: gen snpapers=_N

* Share of the author's papers in the year that are coauthored.
gen co=(nauthors>1)
bys auth year: egen ncopapers=total(co)
gen coauthorship=ncopapers/snpapers

foreach v of varlist snpapers sprodSJR sprodH sprodCites sproddSJR sproddH sproddCites mprodSJR mprodH mprodCites {
    replace `v'=0 if missing(`v')
}

* Replace every sj variable by its author-year total.
foreach i of varlist sj1-sj149{
    bys auth year: egen s`i'=total(`i')
    drop `i'
    rename s`i' `i'
}

* Drop the article-level variables so that rows collapse to one per
* author-year.
drop SJR Hindex CitesDoc2years prodSJR prodH prodCites proddSJR proddH proddCites nauthors npapers articleid co ncopapers 
duplicates drop

* Add network and strength measures. -merge 1:1- states explicitly the
* uniqueness of (auth, year) that the old "merge ... using ..., sort" syntax
* only implied.
merge 1:1 auth year using network_5y
drop _merge
merge 1:1 auth year using strength
drop _merge

xtset auth year, yearly
* First and last year in which the author appears.
bys auth: egen ystart=min(year)
bys auth: egen yend=max(year)
save sociologydatabase, replace

* Fill in the author-year panel: every author gets one row per year between
* the first (ystart) and last (yend) year in which the author appears.

* List of distinct authors.
collapse (mean) year, by(auth)
keep auth
save authorlist, replace

* All (author, year) combinations for 1963-1999.
set more off
clear 
set obs 37
gen year=_n+1962
cross using authorlist

sort auth year
* (auth, year) is unique on both sides, so state it with -merge 1:1- instead
* of the old "merge ... using ..., sort" syntax.
merge 1:1 auth year using sociologydatabase
drop _merge

* ystart/yend are missing on the newly created rows; the within-author mean
* copies them to every row of the author before trimming.
egen mystart=mean(ystart), by(auth)
drop if year<mystart

egen myend=mean(yend), by(auth)
drop if year>myend

drop ystart yend
rename mystart ystart
rename myend yend

compress

save sociologydatabase, replace


set more off

* Years elapsed since the author's first year in the panel, and log
* betweenness.
generate t = year - ystart
generate lbet = log(betweenness+1)

* Output measures and sj variables are missing on the author-years added
* when the panel was filled in; set them to zero.
foreach v of varlist snpapers sprodSJR sprodH sprodCites sproddSJR sproddH sproddCites mprodSJR mprodH mprodCites sj1-sj149 {
    replace `v' = 0 if missing(`v')
}

save sociologydatabase, replace

* Five-year sums (current year plus the four previous years) of
* sproddCites and snpapers, stored as <var>S5. The sum is missing whenever
* one of the four lags is missing.
use sociologydatabase, clear
xtset auth year, yearly
foreach v of varlist sproddCites snpapers {
    forvalues k = 1/4 {
        generate `v'L`k' = L`k'.`v'
    }
    generate `v'S5 = `v'L4 + `v'L3 + `v'L2 + `v'L1 + `v'
    drop `v'L1 `v'L2 `v'L3 `v'L4
}

save sociologydatabase, replace

* Final assembly of sociologydatabase.dta: cumulative output, standardized
* network measures, five-year sj shares, gender and coauthorship measures.
* (An earlier copy of the first three steps, whose results were discarded by
* reloading the file, has been removed.)
use sociologydatabase, clear
* Declare the panel explicitly so the lag operators below do not depend on
* settings stored in the saved file.
xtset auth year, yearly

* Cumulative per-author output up to each year, and the part of it
* accumulated before the current five-year window.
bys auth (year): gen cprodd=sum( sproddCites)
gen cproddl5= cprodd-sproddCitesS5
bys auth: egen mcproddl5=mean(cproddl5) 	

* z-scores computed over all rows in memory.
foreach i in degree clustering lbet cproddl5 mcproddl5 strengthppaper{
    summarize `i'
    gen `i'z=(`i'-r(mean))/r(sd)
}

* Replace each sj variable by its five-year sum divided by the five-year
* number of papers (left undivided when that number is zero).
set more off
foreach i of varlist sj1-sj149{
    replace `i'=0 if missing(`i')
    forvalues k = 1/4 {
        gen `i'L`k'=L`k'.`i'
    }
    gen `i'S5=`i'L4+`i'L3+`i'L2+`i'L1+`i'
    replace `i'S5= `i'S5/snpapersS5 if snpapersS5!=0
    drop `i'L1 `i'L2 `i'L3 `i'L4 `i'
    rename `i'S5 `i'
}

* Attach the gender indicator by author id. The id is called authid until it
* is renamed back, so it is referred to by its full name here rather than by
* the abbreviation "auth".
* NOTE(review): femaleapi_soc_api_old.dta is not created in this do-file -
* confirm where it comes from.
rename auth authid
joinby authid using femaleapi_soc_api_old, unmatched(master)
duplicates drop
* One gender value per author: the maximum over the matched rows.
bys authid: egen mfemaleapi=max(femaleapi)
drop femaleapi
rename mfemaleapi femaleapi
duplicates drop
xtset authid year, yearly
rename authid auth 
drop _merge

* Yearly number of coauthored papers.
* NOTE(review): ncopapers.dta is saved by the "number of coauthored papers"
* step further down in this file, so that step has to be run first.
merge 1:1 auth year using ncopapers, keep(master match) nogenerate
replace ncopapers=0 if missing(ncopapers)
xtset auth year

* Five-year coauthorship share and five-year sum of sproddSJR.
gen coauthorship5y=(ncopapers+L.ncopapers+L2.ncopapers+L3.ncopapers+L4.ncopapers)/snpapersS5
gen sproddSJR5y=sproddSJR+L.sproddSJR+L2.sproddSJR+L3.sproddSJR+L4.sproddSJR
save sociologydatabase, replace






/***** 1) Adding femaleapi from femaleapi.api to the sociology database*****/

* Build an author-level gender indicator from the first name.
use authors, clear

* Take the text after the first comma of the author string and use its first
* word, lower-cased, as the first name.
* (presumably names are stored as "Surname, Given names" - confirm)
split author, parse(,)
split author2, parse(" ")
generate firstname = lower(author21)

* Look the first name up in the gender table; names present only in the
* gender table are not kept.
merge m:1 firstname using "gender_countryapi.dta", keep(master match)

* femaleapi = 1 / 0 when the female / male probability is at least 0.95;
* missing otherwise.
generate femaleapi = 1 if !missing(pfemale) & pfemale >= 0.95
replace femaleapi = 0 if !missing(pmale) & pmale >= 0.95

keep authid femaleapi 
save gender_soc_api, replace


/*number of coauthored papers*/
* Counts, for every author and year, the papers written with at least one
* coauthor and saves the result as ncopapers.dta.
import delimited "sociology.csv", clear

* One row per (article, author slot); empty slots are removed.
generate t = _n
reshape long auth gender, i(t) 
drop if auth==.
drop t _j

* co flags papers with more than one author.
generate co = (nauthors>1)
bysort auth year: egen ncopapers = total(co)

keep auth year ncopapers 
duplicates drop
save ncopapers, replace


/*Coauthors' characteristics*/
* Builds the article-level file networkfemaleapisoct.csv with, for each of up
* to seven authors, the author id, the gender indicator and the first year
* in which the author appears in the data.

* Keep the author-year variable t from the main database in its own file.
use "sociologydatabase", clear 
keep auth year t
save "t.dta", replace  

import delimited "sociology.csv", clear
* NOTE(review): this join is by articleid and the reshape below needs
* femaleapi1, femaleapi2, ... columns, whereas the gender_soc_api.dta saved
* earlier in this do-file keeps only authid and femaleapi - confirm which
* file is meant to be used here.
joinby articleid using "gender_soc_api", unmatched(master)
drop _merge

joinby articleid using "impactfactor", unmatched(master)
drop _merge

* prodd: SJR divided by the number of authors; prod: SJR. The maximum within
* articleid is taken for both, so all rows of an article share one value.
bys articleid: gen t=_n
gen prodd=SJR/nauthors
drop CitesDoc2years Hindex 
bys articleid: egen prodd2=max(prodd)
bys articleid: egen prod=max(SJR)
drop prodd  SJR 
rename prodd2 prodd

* Long format: one row per (article, author slot); myear is the earliest
* publication year of each author.
reshape long auth femaleapi, i(articleid t) j(n)
drop if auth==.
bys auth: egen myear=min(year)
drop t n
* Renumber the remaining rows within each article and go back to wide
* format, one row per article.
bys articleid: gen t=_n
duplicates drop
reshape wide auth femaleapi myear, i(articleid) j(t)
* Slots 8-12 are discarded; they can only exist if an article ends up with
* more than seven rows after the steps above.
* NOTE(review): presumably caused by duplicated rows from the joins - confirm.
drop auth8 femaleapi8 myear8 auth9 femaleapi9 myear9 auth10 femaleapi10 myear10 auth11 femaleapi11 myear11 auth12 femaleapi12 myear12
order articleid auth1 femaleapi1 myear1 auth2 femaleapi2 myear2 auth3 femaleapi3 myear3 auth4 femaleapi4 myear4 auth5 femaleapi5 myear5 auth6 femaleapi6 myear6 auth7 femaleapi7 myear7 year npapers prod nauthors prodd
export delimited "networkfemaleapisoct.csv", replace
 
 
set more off

* Convert the yearly network CSV files (1974-1999) that carry the
* gender-specific neighbour columns into neighchar<year>_5y.dta.
* NOTE(review): these are read from the same file names
* ("network<year>_5y.csv") as in section 2 but with different columns -
* presumably the output of a different R script; confirm before running.
forvalues yr = 1974/1999 {
    insheet using "network`yr'_5y.csv", clear
    generate year = `yr'
    drop v1
    rename (vgauth vgdeg   vgnetprodmale vgnetprodfemale vgpapersmale vgpapersfemale) ///
           (auth   degreex netprodm      netprodf        neighpapersm neighpapersf)
    rename (vgnmaleco vgnfemaleco vgtmale vgtfemale) ///
           (namecom   namecof     avgtcom avgtcof)
    order auth year
    sort auth year
    save neighchar`yr'_5y, replace
}

* Stack all years and discard authors without any link.
use neighchar1974_5y, clear
forvalues yr = 1975/1999 {
    append using neighchar`yr'_5y
}
keep if degreex != 0
save neighchar_5y, replace

